##########################################
# R语言入门与回归分析基础                 
# 严海兵 华东政法大学政治学与公共管理学院 
##########################################


### 内容提要
### 1.下载安装
### 2.命令、对象和函数
### 3.数据类型与数据结构
### 4.读入、查看和保存数据
### 5.数据处理
### 6.数据可视化
### 7.回归分析


### 1.下载安装
## 下载R软件:
# 官方网站the Comprehensive R Archive Network (CRAN): https://cran.r-project.org
# Windows用户点击 "Download R for Windows", 然后点击"base", 然后再点击"Download R x.x.x for Windows" 
# Mac用户点击"Download R for (Mac) OS X", 然后点击"R-x.x.x.pkg"

# 注意:如果下载速度比较慢,你可以从本地镜像中下载
# 方法:点击CRAN网站首页左上角的 "Mirrors", 然后找到中国选择一个镜像下载。

## 安装R:
# 对于windows用户, 双击“R-x.x.x-win.exe”, 然后按照下列步骤安装: 
# 选择语言(建议选英语)--> click OK --> click Next --> click Next 
# --> Select destination Location (change “C:\Program Files\R\R-x.x.x” into “C:\Program Files\R\”) and click Next 
# --> select Core Files, 64-bit Files(如果电脑是32位系统则选32-bit Files), and Message translation, then click Next
# --> select “No” and click Next --> select start menu folder and click Next 
# --> select additional tasks and click Next --> click Finish

# 对于Mac用户, 双击 “R-x.x.x.pkg”, 然后按照下列步骤安装: 
# click Continue --> click Continue --> click Continue --> click Continue 
# --> click Agree --> click Install --> click Close to finish the installation

# 熟悉R:
# 打开刚才安装的R软件
# 输入代码 print("Hello World") 或"Hello World"
# 要重复相同的代码,不必重新输入,只需要按光标上键,然后回车

## 安装RStudio:
# 从以下官网下载RStudio: https://www.rstudio.com, 然后安装
# 设置本地镜像:点击Tools --> Global Options --> Packages --> Change --> 选择一个中国镜像
# 更改界面显示:点击Tools --> Global Options --> Appearance
# 更改窗口排列:点击Tools --> Global Options --> Pane Layout
  

### 2. 命令、对象和函数
## 基础命令
# 作为R命令的数学运算
3 + 3
## [1] 6
3 * 3
## [1] 9
3 ^ 3
## [1] 27
# 运算符号:
# 加号(+);
# 减号(-);
# 乘号(*);
# 除号(/);
# 乘方(^)

# 运算顺序:
# 首先算括号, 其次算乘方,然后算乘除,最后算加减
6 + 15 * 22
## [1] 336
(6 + 15) * 22
## [1] 462
22 ^ 2 + 7
## [1] 491
22 ^ (2 + 7)
## [1] 1.207269e+12
# 科学计数法: 1.207269e + 12 = 1.207269 × 10^12 = 1, 207, 269, 000, 000 


## 对象(Objects)
# 对象是用来存储数据的名称(又称变量)
# 例如,把3+3的结果保存为x,x就是一个对象
x <- 3 + 3
# 输入x就会返回这个对象的结果
x
## [1] 6
# 注意:
# (1) <- 又叫赋值运算符,等号(=)也可以用来赋值,但不提倡,因为容易造成混淆
# (2) Rstudio中输入<-的快捷方式:mac为同时按option和减号,win为同时按alt和减号
# (3) 在R中对象名称的字母是区分大小写的,比如大写X和小写x表示的是不同的对象
# (4) 当用同样的名称保存另外一个对象时,之前保存的对象就会被替换
x <- 3 + 2
x 
## [1] 5
x2 <- (5 + 1) ^ 2
x2
## [1] 36
x + x2
## [1] 41
x2 / x
## [1] 7.2
# 保存两个对象运算的结果:
x3 <- x2 / x
x3
## [1] 7.2
# 练习1
# (1)   把数字7保存为对象z1;
# (2)   把10+5保存为对象z2;
# (3)   把z1乘z2保存为对象z3;
# (4)   对z3减1进行平方。


# 保存并直接输出结果
n1 <- 2 * 6; n1
## [1] 12
(n2 <- 24 / 3)
## [1] 8
# 注意:R对象的名称可以由字母 (a-z, A-Z)、数字 (0-9)、点 (.)和下划线 (_)组成。
# 第一个元素必须是字母或点 
# 三种通常的命名方式:
# (1)   total_income
# (2)   total.income
# (3)   totalIncome


## 函数(Functions)
# R拥有大量内置的函数,函数由函数名、圆括号和参数(argument)组成。
log(27)   # 27的自然对数
## [1] 3.295837
sqrt(225) # 225的平方根
## [1] 15
abs(-9)   # -9的绝对值
## [1] 9
# 函数和对象可以同时使用:
y1 <- 20 + 5
sqrt(y1 + 200)
## [1] 15
# 用函数建立一个新的对象:
y2 <- sqrt(y1 + 1200)
y2
## [1] 35
# 用打印函数输出结果
print(y2)   
## [1] 35
# 包含参数的函数:
round(x = 14.5378, digits = 2)
## [1] 14.54
round(14.5378, 1)  # 简化书写
## [1] 14.5
# 参数的默认值:
round(14.5378)
## [1] 15
# 函数的嵌套运用
round(sqrt(20), 2)
## [1] 4.47
# function()也是一个函数
new_func <- function(x){
  (x + 5) * 6
}
new_func(2)
## [1] 42
# 创建一个n以内数字连续相加的函数
sum_func <- function(n){
  x <- 0
  for(i in 1:n)
    x = x + i
    print(x)
}
sum_func(100)
## [1] 5050
# 练习2
# 使用我们前面练习中存储的对象z1, z2和z3做下面的练习:
# (1)   用函数log10()来计算以10为底的z1的对数;
# (2)   用函数sqrt()来计算z2的平方根;
# (3)   用函数sum()来计算z1,z2与z3的和(提示:在括号中用逗号把三个对象隔开);
# (4) 把66.83596存储为对象z4,然后用函数round()保留小数点后三位数。


## 正态分布函数
# 随机生成30个正态分布的数字
rnorm(30) 
##  [1] -0.02904121  0.64783878 -0.45181684  1.97815402  0.44764250
##  [6]  0.32306779 -0.81529937  1.93400570  0.12998041 -0.89175925
## [11] -0.19041786  0.17364719 -0.97593036 -0.53539099  0.65129563
## [16] -1.35776274  1.72278031  0.15262557  1.15766988 -1.05275524
## [21] -0.79137832 -1.11829970  1.34530304 -0.98634110 -0.80232575
## [26] -0.66748559 -0.85004735  0.16331619 -1.47237205 -0.39387200
# 设定种子使结果可复现
set.seed(321)
my_norm <- rnorm(30); my_norm
##  [1]  1.70490322 -0.71203856 -0.27798491 -0.11964902 -0.12396062
##  [6]  0.26818377  0.72684149  0.23313541  0.33911388 -0.55191467
## [11]  0.34770136  1.48459181  0.18832552  2.44325982 -1.15343949
## [16] -0.80467168  0.45606915  0.42033257  0.57758450  0.44635606
## [21]  0.91725553 -0.10706154  0.98833540 -1.07223880 -0.75801528
## [26]  0.09500072 -2.33093117  0.41751598 -1.12032742 -0.47468470
# 用hist()函数绘制直方图
hist(my_norm)

# 查看函数的参数(arguments)
help(rnorm) 
# 或者用问号
?rnorm
my_data <- rnorm(100, mean = 5, sd = 3); my_data
##   [1]  0.4087635  6.2471445  6.9025938  8.6925421  4.5363090  5.3436183
##   [7] -1.6787899  9.9073710  4.5220378  5.0848024  0.3900401  5.2344734
##  [13]  5.7511874  5.7322616  7.3992147  6.0242287  5.7761530  7.8616359
##  [19]  8.4092442  2.0471359  2.2710361  2.3327239  4.2519794  5.9625528
##  [25]  3.0038918  1.9732504  6.0236048  6.1651935 10.3104926  5.6379864
##  [31]  9.3860519  5.1654673  8.7445352  2.0058746  8.5127430 -2.4897467
##  [37]  3.8205627  4.5096688  7.1264683  1.5131587  2.0223276  3.8838382
##  [43]  4.7315165  6.1098138  6.4832296  4.4715424  6.7702906  9.2180603
##  [49]  6.7023223  5.2225075  2.6491256  6.1992028  5.3603921  3.0333240
##  [55]  8.9812296  1.3670394 -0.4721249  2.8140450  4.7844491  4.7552152
##  [61]  0.5682751  4.3471364  7.2002030  0.9758089  2.7089801 11.8635250
##  [67]  6.8955875  7.0949944  5.0539836  2.7622322  1.0895282  2.2438911
##  [73]  8.3998980  7.4535358  1.4772297  4.7346048  7.9668772  6.9718932
##  [79]  4.3593921  3.9717575  0.3089843  6.5590692  5.7485312  6.2740564
##  [85]  9.6300191  6.0126479  4.1671842  5.4993752  1.8932725  4.1586340
##  [91]  7.8904932  2.5894741  2.8658086  1.8995941  6.7715629  7.7619308
##  [97]  5.7097055  8.4391624  5.6874211  3.2851195
hist(my_data)

# 设置绘图函数的参数
?hist
hist(my_data,                                   # 设置数据
     col = "lightblue",                         # 填充条的颜色
     border = "black",                          # 设置条边框的颜色
     main = "Histogram of 100 Random Numbers",  # 设置主标题
     xlab = "Random Numbers")                   # 设置横坐标的名称


# 练习3
# 使用runif()函数生成100个均匀分布的数字并保存为vec_unif
# 然后用vec_unif绘制一个直方图,并添加颜色


### 3. 数据类型与数据结构
# 查看数据类型
class(my_data)
## [1] "numeric"
# 向量(vector):相同数据类型构成的一维数据结构
is.vector(my_data)
## [1] TRUE
# 创建三个不同类型的向量
# 字符型向量
name <- c("Zhao", "Qian", "Sun", "Li")
class(name)
## [1] "character"
# 数值型向量
score <- c(517, 413, 306, 489)
class(score)
## [1] "numeric"
typeof(score)
## [1] "double"
# 逻辑型向量
pass <- c(TRUE, FALSE, FALSE, TRUE)
class(pass)
## [1] "logical"
# 利用向量创建一个数据框(data frame)
# 数据框:由行和列组成的二维数据结构,每列的数据类型必须相同
student <- data.frame(name, score, pass); student
##   name score  pass
## 1 Zhao   517  TRUE
## 2 Qian   413 FALSE
## 3  Sun   306 FALSE
## 4   Li   489  TRUE
# 查看数据结构
str(student)
## 'data.frame':    4 obs. of  3 variables:
##  $ name : Factor w/ 4 levels "Li","Qian","Sun",..: 4 2 3 1
##  $ score: num  517 413 306 489
##  $ pass : logi  TRUE FALSE FALSE TRUE
# Factor(因子)是一种特殊的向量,用来存储类别变量
# data.frame()函数创建数据框时自动把字符型向量转变成因子
# 可以通过设置参数阻止转换
student <- data.frame(name, score, pass, stringsAsFactors = FALSE)
str(student)
## 'data.frame':    4 obs. of  3 variables:
##  $ name : chr  "Zhao" "Qian" "Sun" "Li"
##  $ score: num  517 413 306 489
##  $ pass : logi  TRUE FALSE FALSE TRUE
# 美元符号$: 选取某个变量
student$name
## [1] "Zhao" "Qian" "Sun"  "Li"
student$score
## [1] 517 413 306 489
### 4. 读入、查看和保存数据
# R包是拓展R功能的软件(代码库),可以通过install.packages()函数从官网CRAN下载各种包
# R和R包的关系类似手机和手机APP的关系
# 下载gapminder数据包
# install.packages("gapminder")
# 加载/调用R包
library(gapminder)
# 或者
require(gapminder)


## 查看gapminder数据信息
str(gapminder)                                # 查看数据结构
## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
names(gapminder)                              # 查看变量名称
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
head(gapminder)                               # 查看前6行数据
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
tail(gapminder)                               # 查看最后6行数据
## # A tibble: 6 x 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.
head(gapminder, 3)                            # 查看前3行数据
## # A tibble: 3 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
?gapminder                                    # 查看说明文档


## 查看基本的统计信息
summary(gapminder)                            # 查看所有变量的基本统计信息
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
summary(gapminder$lifeExp)                    # 查看某个变量的基本统计信息
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.60   48.20   60.71   59.47   70.85   82.60
mean(gapminder$gdpPercap, na.rm = TRUE)       # 计算某个变量的均值
## [1] 7215.327
table(gapminder$continent)                    # 计算类别变量的频数
## 
##   Africa Americas     Asia   Europe  Oceania 
##      624      300      396      360       24
table(gapminder$year)                         # 计算年份的频数
## 
## 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007 
##  142  142  142  142  142  142  142  142  142  142  142  142
prop.table(table(gapminder$continent))        # 计算各类别频数的占比
## 
##     Africa   Americas       Asia     Europe    Oceania 
## 0.36619718 0.17605634 0.23239437 0.21126761 0.01408451
quantile(gapminder$lifeExp)                   # 计算变量的分位数(0%,25%,50%,75%和100%)
##      0%     25%     50%     75%    100% 
## 23.5990 48.1980 60.7125 70.8455 82.6030
quantile(gapminder$lifeExp, 0.85)             # 计算变量的85%分位数
##     85% 
## 73.4755
quantile(gapminder$lifeExp, seq(0.1, 1, 0.1)) # 计算间隔为10%的分位数
##     10%     20%     30%     40%     50%     60%     70%     80%     90% 
## 41.5108 45.8992 50.6021 55.7292 60.7125 66.0814 69.7465 72.0288 75.0970 
##    100% 
## 82.6030
## 读入外部数据
# readr: 读取csv和fwf文档
# readxl:读取xls和xlsx文档
# haven:读取SAS、SPSS和Stata文档
# install.packages(c("readr","readxl"))
library(readr)
library(readxl)

# 将gapminder保存为csv文档
write_csv(gapminder, "gapminder.csv")
# 读入csv文档
read_csv("gapminder.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_double(),
##   pop = col_double(),
##   gdpPercap = col_double()
## )
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <chr>       <chr>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
# 将csv另存为excel格式
# 读入excel文档
read_excel("gapminder.xlsx")
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <chr>       <chr>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
### 5. 数据处理
## tidyverse
# tidyverse既是一种R编程风格, 也是一组R包的集合
# 作为R包集合, tidyverse包含的核心包如下:
# ggplot2 - 数据可视化
# dplyr - 数据转换
# tidyr - 数据整洁(清洗)
# readr - 读入矩形数据(csv,tsv和fwf格式的数据)
# purrr - 函数编程
# tibble - 新型数据框
# stringr - 字符数据处理
# forcats - 因子数据处理

# 整体安装和加载tidyverse系列包
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.1     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ ggplot2 3.2.0     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

## tibble
# 不把字符自动转成因子
tibble(name = c("Zhao", "Qian", "Sun", "Li"),
       score = c(517, 413, 306, 489), 
       pass = c(TRUE, FALSE, FALSE, TRUE))
## # A tibble: 4 x 3
##   name  score pass 
##   <chr> <dbl> <lgl>
## 1 Zhao    517 TRUE 
## 2 Qian    413 FALSE
## 3 Sun     306 FALSE
## 4 Li      489 TRUE
## dplyr
# dplyr是tidyverse系列包的核心组成部分,主要用于数据转换(管理)
# dplyr包的核心函数:
# select(), rename(), filter(), arrange(), mutate(), summarize()以及%>%
# 也可以单独加载dplyr包
# install.packages("dplyr")
# library(dplyr)

## select(): 选取特定变量的数据
select(gapminder, country, year, lifeExp)  # 选取country, year和lifeExp
## # A tibble: 1,704 x 3
##    country      year lifeExp
##    <fct>       <int>   <dbl>
##  1 Afghanistan  1952    28.8
##  2 Afghanistan  1957    30.3
##  3 Afghanistan  1962    32.0
##  4 Afghanistan  1967    34.0
##  5 Afghanistan  1972    36.1
##  6 Afghanistan  1977    38.4
##  7 Afghanistan  1982    39.9
##  8 Afghanistan  1987    40.8
##  9 Afghanistan  1992    41.7
## 10 Afghanistan  1997    41.8
## # … with 1,694 more rows
select(gapminder, country:lifeExp)         # 选取从country到lifeExp的所有变量
## # A tibble: 1,704 x 4
##    country     continent  year lifeExp
##    <fct>       <fct>     <int>   <dbl>
##  1 Afghanistan Asia       1952    28.8
##  2 Afghanistan Asia       1957    30.3
##  3 Afghanistan Asia       1962    32.0
##  4 Afghanistan Asia       1967    34.0
##  5 Afghanistan Asia       1972    36.1
##  6 Afghanistan Asia       1977    38.4
##  7 Afghanistan Asia       1982    39.9
##  8 Afghanistan Asia       1987    40.8
##  9 Afghanistan Asia       1992    41.7
## 10 Afghanistan Asia       1997    41.8
## # … with 1,694 more rows
select(gapminder, -(lifeExp:gdpPercap))    # 选取从lifeExp到gdpPercap之外的所有变量
## # A tibble: 1,704 x 3
##    country     continent  year
##    <fct>       <fct>     <int>
##  1 Afghanistan Asia       1952
##  2 Afghanistan Asia       1957
##  3 Afghanistan Asia       1962
##  4 Afghanistan Asia       1967
##  5 Afghanistan Asia       1972
##  6 Afghanistan Asia       1977
##  7 Afghanistan Asia       1982
##  8 Afghanistan Asia       1987
##  9 Afghanistan Asia       1992
## 10 Afghanistan Asia       1997
## # … with 1,694 more rows
select(gapminder, 1, 3, 4)                 # 选取第1、3、4行的变量
## # A tibble: 1,704 x 3
##    country      year lifeExp
##    <fct>       <int>   <dbl>
##  1 Afghanistan  1952    28.8
##  2 Afghanistan  1957    30.3
##  3 Afghanistan  1962    32.0
##  4 Afghanistan  1967    34.0
##  5 Afghanistan  1972    36.1
##  6 Afghanistan  1977    38.4
##  7 Afghanistan  1982    39.9
##  8 Afghanistan  1987    40.8
##  9 Afghanistan  1992    41.7
## 10 Afghanistan  1997    41.8
## # … with 1,694 more rows
select(gapminder, -c(2, 4))                # 选取第2、4行以外的变量
## # A tibble: 1,704 x 4
##    country      year      pop gdpPercap
##    <fct>       <int>    <int>     <dbl>
##  1 Afghanistan  1952  8425333      779.
##  2 Afghanistan  1957  9240934      821.
##  3 Afghanistan  1962 10267083      853.
##  4 Afghanistan  1967 11537966      836.
##  5 Afghanistan  1972 13079460      740.
##  6 Afghanistan  1977 14880372      786.
##  7 Afghanistan  1982 12881816      978.
##  8 Afghanistan  1987 13867957      852.
##  9 Afghanistan  1992 16317921      649.
## 10 Afghanistan  1997 22227415      635.
## # … with 1,694 more rows
select(gapminder, starts_with("c"))        # 选取以"c"开头的变量
## # A tibble: 1,704 x 2
##    country     continent
##    <fct>       <fct>    
##  1 Afghanistan Asia     
##  2 Afghanistan Asia     
##  3 Afghanistan Asia     
##  4 Afghanistan Asia     
##  5 Afghanistan Asia     
##  6 Afghanistan Asia     
##  7 Afghanistan Asia     
##  8 Afghanistan Asia     
##  9 Afghanistan Asia     
## 10 Afghanistan Asia     
## # … with 1,694 more rows
select(gapminder, ends_with("p"))          # 选取以"p"结尾的变量
## # A tibble: 1,704 x 3
##    lifeExp      pop gdpPercap
##      <dbl>    <int>     <dbl>
##  1    28.8  8425333      779.
##  2    30.3  9240934      821.
##  3    32.0 10267083      853.
##  4    34.0 11537966      836.
##  5    36.1 13079460      740.
##  6    38.4 14880372      786.
##  7    39.9 12881816      978.
##  8    40.8 13867957      852.
##  9    41.7 16317921      649.
## 10    41.8 22227415      635.
## # … with 1,694 more rows
select(gapminder, contains("o"))           # 选取包含"o"的变量
## # A tibble: 1,704 x 3
##    country     continent      pop
##    <fct>       <fct>        <int>
##  1 Afghanistan Asia       8425333
##  2 Afghanistan Asia       9240934
##  3 Afghanistan Asia      10267083
##  4 Afghanistan Asia      11537966
##  5 Afghanistan Asia      13079460
##  6 Afghanistan Asia      14880372
##  7 Afghanistan Asia      12881816
##  8 Afghanistan Asia      13867957
##  9 Afghanistan Asia      16317921
## 10 Afghanistan Asia      22227415
## # … with 1,694 more rows
## rename(): 重新命名变量
rename(gapminder, life_expectancy = lifeExp, population = pop)
## # A tibble: 1,704 x 6
##    country     continent  year life_expectancy population gdpPercap
##    <fct>       <fct>     <int>           <dbl>      <int>     <dbl>
##  1 Afghanistan Asia       1952            28.8    8425333      779.
##  2 Afghanistan Asia       1957            30.3    9240934      821.
##  3 Afghanistan Asia       1962            32.0   10267083      853.
##  4 Afghanistan Asia       1967            34.0   11537966      836.
##  5 Afghanistan Asia       1972            36.1   13079460      740.
##  6 Afghanistan Asia       1977            38.4   14880372      786.
##  7 Afghanistan Asia       1982            39.9   12881816      978.
##  8 Afghanistan Asia       1987            40.8   13867957      852.
##  9 Afghanistan Asia       1992            41.7   16317921      649.
## 10 Afghanistan Asia       1997            41.8   22227415      635.
## # … with 1,694 more rows
# 注意: select() 也可以对变量重新命名,但是会抛弃未重新命名的其他变量
select(gapminder, life_expectancy = lifeExp, population = pop)
## # A tibble: 1,704 x 2
##    life_expectancy population
##              <dbl>      <int>
##  1            28.8    8425333
##  2            30.3    9240934
##  3            32.0   10267083
##  4            34.0   11537966
##  5            36.1   13079460
##  6            38.4   14880372
##  7            39.9   12881816
##  8            40.8   13867957
##  9            41.7   16317921
## 10            41.8   22227415
## # … with 1,694 more rows
## filter(): 根据逻辑条件选取特定行的数据
# 常见的逻辑运算符号及其含义
############################
#   符号   #     含义       
#    <     #     小于       
#   <=     #  小于或等于    
#    >     #     大于       
#   >=     #  大于或等于    
#   ==     #     等于       
#   !=     #   不等于       
#   !x     #     非x        
#  x | y   #    x或y        
#  x & y   #    x且y        
# x %in% y # 检验x是否包含y 
#############################
filter(gapminder, country == "China")
## # A tibble: 12 x 6
##    country continent  year lifeExp        pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
##  1 China   Asia       1952    44    556263527      400.
##  2 China   Asia       1957    50.5  637408000      576.
##  3 China   Asia       1962    44.5  665770000      488.
##  4 China   Asia       1967    58.4  754550000      613.
##  5 China   Asia       1972    63.1  862030000      677.
##  6 China   Asia       1977    64.0  943455000      741.
##  7 China   Asia       1982    65.5 1000281000      962.
##  8 China   Asia       1987    67.3 1084035000     1379.
##  9 China   Asia       1992    68.7 1164970000     1656.
## 10 China   Asia       1997    70.4 1230075000     2289.
## 11 China   Asia       2002    72.0 1280400000     3119.
## 12 China   Asia       2007    73.0 1318683096     4959.
filter(gapminder, continent == "Asia", year == 2007)
## # A tibble: 33 x 6
##    country          continent  year lifeExp        pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>      <int>     <dbl>
##  1 Afghanistan      Asia       2007    43.8   31889923      975.
##  2 Bahrain          Asia       2007    75.6     708573    29796.
##  3 Bangladesh       Asia       2007    64.1  150448339     1391.
##  4 Cambodia         Asia       2007    59.7   14131858     1714.
##  5 China            Asia       2007    73.0 1318683096     4959.
##  6 Hong Kong, China Asia       2007    82.2    6980412    39725.
##  7 India            Asia       2007    64.7 1110396331     2452.
##  8 Indonesia        Asia       2007    70.6  223547000     3541.
##  9 Iran             Asia       2007    71.0   69453570    11606.
## 10 Iraq             Asia       2007    59.5   27499638     4471.
## # … with 23 more rows
filter(gapminder, lifeExp < 30)
## # A tibble: 2 x 6
##   country     continent  year lifeExp     pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8 8425333      779.
## 2 Rwanda      Africa     1992    23.6 7290203      737.
filter(gapminder, country %in% c("China", "Japan", "United States"))
## # A tibble: 36 x 6
##    country continent  year lifeExp        pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
##  1 China   Asia       1952    44    556263527      400.
##  2 China   Asia       1957    50.5  637408000      576.
##  3 China   Asia       1962    44.5  665770000      488.
##  4 China   Asia       1967    58.4  754550000      613.
##  5 China   Asia       1972    63.1  862030000      677.
##  6 China   Asia       1977    64.0  943455000      741.
##  7 China   Asia       1982    65.5 1000281000      962.
##  8 China   Asia       1987    67.3 1084035000     1379.
##  9 China   Asia       1992    68.7 1164970000     1656.
## 10 China   Asia       1997    70.4 1230075000     2289.
## # … with 26 more rows
## arrange(): 根据变量(或列)对行进行重新排序
arrange(gapminder, pop)
## # A tibble: 1,704 x 6
##    country               continent  year lifeExp   pop gdpPercap
##    <fct>                 <fct>     <int>   <dbl> <int>     <dbl>
##  1 Sao Tome and Principe Africa     1952    46.5 60011      880.
##  2 Sao Tome and Principe Africa     1957    48.9 61325      861.
##  3 Djibouti              Africa     1952    34.8 63149     2670.
##  4 Sao Tome and Principe Africa     1962    51.9 65345     1072.
##  5 Sao Tome and Principe Africa     1967    54.4 70787     1385.
##  6 Djibouti              Africa     1957    37.3 71851     2865.
##  7 Sao Tome and Principe Africa     1972    56.5 76595     1533.
##  8 Sao Tome and Principe Africa     1977    58.6 86796     1738.
##  9 Djibouti              Africa     1962    39.7 89898     3021.
## 10 Sao Tome and Principe Africa     1982    60.4 98593     1890.
## # … with 1,694 more rows
arrange(gapminder, desc(pop))
## # A tibble: 1,704 x 6
##    country continent  year lifeExp        pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
##  1 China   Asia       2007    73.0 1318683096     4959.
##  2 China   Asia       2002    72.0 1280400000     3119.
##  3 China   Asia       1997    70.4 1230075000     2289.
##  4 China   Asia       1992    68.7 1164970000     1656.
##  5 India   Asia       2007    64.7 1110396331     2452.
##  6 China   Asia       1987    67.3 1084035000     1379.
##  7 India   Asia       2002    62.9 1034172547     1747.
##  8 China   Asia       1982    65.5 1000281000      962.
##  9 India   Asia       1997    61.8  959000000     1459.
## 10 China   Asia       1977    64.0  943455000      741.
## # … with 1,694 more rows
# 先按gdpPercap再按pop排序
arrange(gapminder, gdpPercap, pop) 
## # A tibble: 1,704 x 6
##    country          continent  year lifeExp      pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Congo, Dem. Rep. Africa     2002    45.0 55379852      241.
##  2 Congo, Dem. Rep. Africa     2007    46.5 64606759      278.
##  3 Lesotho          Africa     1952    42.1   748747      299.
##  4 Guinea-Bissau    Africa     1952    32.5   580653      300.
##  5 Congo, Dem. Rep. Africa     1997    42.6 47798986      312.
##  6 Eritrea          Africa     1952    35.9  1438760      329.
##  7 Myanmar          Asia       1952    36.3 20092996      331 
##  8 Lesotho          Africa     1957    45.0   813338      336.
##  9 Burundi          Africa     1952    39.0  2445618      339.
## 10 Eritrea          Africa     1957    38.0  1542611      344.
## # … with 1,694 more rows
# 注意: 如果有缺失值(NA), 缺失值总是排在最后

## mutate(): 基于现有变量创建新的变量
mutate(gapminder,  gdp = gdpPercap*pop)
## # A tibble: 1,704 x 7
##    country     continent  year lifeExp      pop gdpPercap          gdp
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.
##  2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.
##  3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.
##  4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.
##  5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.
##  6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.
##  7 Afghanistan Asia       1982    39.9 12881816      978. 12598563401.
##  8 Afghanistan Asia       1987    40.8 13867957      852. 11820990309.
##  9 Afghanistan Asia       1992    41.7 16317921      649. 10595901589.
## 10 Afghanistan Asia       1997    41.8 22227415      635. 14121995875.
## # … with 1,694 more rows
# 注意: transmute()函数也可以创建新的变量, 但是会抛弃其他变量
transmute(gapminder, gdp = gdpPercap*pop)
## # A tibble: 1,704 x 1
##             gdp
##           <dbl>
##  1  6567086330.
##  2  7585448670.
##  3  8758855797.
##  4  9648014150.
##  5  9678553274.
##  6 11697659231.
##  7 12598563401.
##  8 11820990309.
##  9 10595901589.
## 10 14121995875.
## # … with 1,694 more rows
# mutate()与case_when()联合使用, 可以对变量进行重新编码
gapminder2007 <- filter(gapminder, year == 2007)
summary(gapminder2007$gdpPercap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   277.6  1624.8  6124.4 11680.1 18008.8 49357.2
gapminder2007 <- mutate(
  gapminder2007, 
  gdpPercap_class = case_when(
    gdpPercap >= 18008.8 ~ "High Income",
    gdpPercap < 1624.8 ~ "Low Income",
    gdpPercap >= 1624.8 & gdpPercap < 18008.8 ~ "Middle Income"
    )
  )

## summarize()/summarise(): 用于计算变量的描述统计值
summarize(gapminder, mean(gdpPercap))
## # A tibble: 1 x 1
##   `mean(gdpPercap)`
##               <dbl>
## 1             7215.
summarize(gapminder, 
          lifeExp_avg = mean(lifeExp),
          pop_avg = mean(pop),
          gdpPercap_avg = mean(gdpPercap))
## # A tibble: 1 x 3
##   lifeExp_avg   pop_avg gdpPercap_avg
##         <dbl>     <dbl>         <dbl>
## 1        59.5 29601212.         7215.
# summarize()通常结合group_by()一起使用
# 按照年份分组
gapminder_year <- group_by(gapminder, year)
summarize(gapminder_year, 
          lifeExp_avg = mean(lifeExp),
          lifeExp_med = median(lifeExp))
## # A tibble: 12 x 3
##     year lifeExp_avg lifeExp_med
##    <int>       <dbl>       <dbl>
##  1  1952        49.1        45.1
##  2  1957        51.5        48.4
##  3  1962        53.6        50.9
##  4  1967        55.7        53.8
##  5  1972        57.6        56.5
##  6  1977        59.6        59.7
##  7  1982        61.5        62.4
##  8  1987        63.2        65.8
##  9  1992        64.2        67.7
## 10  1997        65.0        69.4
## 11  2002        65.7        70.8
## 12  2007        67.0        71.9
# 按照各洲分组
gapminder_continent <- group_by(gapminder, continent)
summarize(gapminder_continent, 
          count = n(), 
          lifeExp_min = min(lifeExp),
          lifeExp_max = max(lifeExp),
          lifeExp_avg = mean(lifeExp))
## # A tibble: 5 x 5
##   continent count lifeExp_min lifeExp_max lifeExp_avg
##   <fct>     <int>       <dbl>       <dbl>       <dbl>
## 1 Africa      624        23.6        76.4        48.9
## 2 Americas    300        37.6        80.7        64.7
## 3 Asia        396        28.8        82.6        60.1
## 4 Europe      360        43.6        81.8        71.9
## 5 Oceania      24        69.1        81.2        74.3
## %>%: 管道操作符
gapminder %>% 
  filter(year == 2007) %>% 
  group_by(continent) %>%
  summarize(count = n(), 
            lifeExp_min = min(lifeExp),
            lifeExp_max = max(lifeExp),
            lifeExp_avg = mean(lifeExp))
## # A tibble: 5 x 5
##   continent count lifeExp_min lifeExp_max lifeExp_avg
##   <fct>     <int>       <dbl>       <dbl>       <dbl>
## 1 Africa       52        39.6        76.4        54.8
## 2 Americas     25        60.9        80.7        73.6
## 3 Asia         33        43.8        82.6        70.7
## 4 Europe       30        71.8        81.8        77.6
## 5 Oceania       2        80.2        81.2        80.7
# 注: 如果%>%不起作用, 可以安装并加载magrittr

## 快捷键
# Rstudio shortcut for %>%
# mac: shift + command + M
# win: shift + control + M


### 6. 数据可视化
library(ggplot2)

## ggplot2语法的关键构件:
# 1. 用于绘图的DATA(数据层)
# 2. 数据映射AESTHETIC(美学层)
# 3. 图形对象GEOMETRIC(图形层)

## 绘图模版
# ggplot(data = DATA, mapping = aes(MAPPINGS)) + GEOM_FUCTION()
# 或者
# ggplot(data = DATA) + GEOM_FUCTION(mapping = aes(MAPPINGS))
# 或者
# p <- ggplot(data = DATA, mapping = aes(MAPPINGS))
# p + GEOM_FUCTION()

## 1.直方图
ggplot(data = gapminder2007, mapping = aes(x = lifeExp))

# 添加图形
ggplot(gapminder2007, aes(lifeExp)) + geom_histogram()  
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 调整条的宽度
ggplot(gapminder2007, aes(lifeExp)) + geom_histogram(bins = 20)   # 调整数量

ggplot(gapminder, aes(lifeExp)) + geom_histogram(binwidth = 6)    # 调整宽度

# 设置条及其边框的颜色
ggplot(gapminder2007, aes(lifeExp)) + 
  geom_histogram(fill = "lightblue", color = "black") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 添加主标题/x轴标签/更换主题模版
ggplot(gapminder2007, aes(lifeExp)) + 
  geom_histogram(fill = "lightblue", color = "white") +
  ggtitle("Distribution of Life Expectancy, 2007") +
  xlab("Life Expectancy") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 条形图
ggplot(gapminder2007, aes(continent)) + geom_bar()

ggplot(gapminder2007, aes(continent, fill = continent)) + 
  geom_bar(alpha = 0.5) +
  ggtitle("Number of Countries on Each Continent") + 
  xlab("") + guides(fill = FALSE) + 
  theme_minimal() +
  theme(panel.grid.major.x = element_blank())

gapminder2007 %>% 
  count(continent) %>% 
  mutate(continent = reorder(continent, n)) %>% 
  ggplot(aes(continent, n)) + 
  geom_col(fill = "skyblue3", alpha = 0.8) +
  labs(title = "Number of Countries on Each Continent",
       x = "", y = "Frequency") +
  theme_bw()

## 箱线图
ggplot(gapminder2007, aes(continent, lifeExp)) + geom_boxplot()

ggplot(gapminder2007, aes(reorder(continent, lifeExp, median), lifeExp, fill = continent)) +
  geom_boxplot(alpha = 0.6) +  
  labs(title = "Life Expectancy Comparison, 2007", 
       xlab ="", y = "Life Expectancy") + 
  guides(fill = FALSE) +
  theme_minimal()

## 折线图
gapminder %>% ggplot(aes(year, lifeExp, group = country)) + 
  geom_line()

gapminder %>% 
  filter(country %in% c("Rwanda", "Iraq")) %>% 
  ggplot(aes(year, lifeExp, group = country, color = country)) + 
  geom_line(size = 1.2) +
  labs(title = "Two Countries Comparison", 
       x = "", y= "Life Expectancy") +
  theme_minimal()

## 散点图
p <- ggplot(gapminder2007, aes(gdpPercap, lifeExp)) + geom_point(shape = 1); p 

p + geom_smooth()  # 添加loess曲线 (defalt: method = loess)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(gapminder2007, aes(log(gdpPercap), lifeExp)) + 
  geom_point(shape = 1) +
  geom_smooth(method = lm)  # 添加回归线

ggplot(gapminder2007, aes(log(gdpPercap), lifeExp, color = continent)) + 
  geom_point(size = 3.5, alpha = 0.5) +
  geom_smooth(method = lm, se = FALSE) +
  labs(x = "GDP per capita (log)", y = "Life Expectancy") +
  theme_minimal()

gapminder2007 %>% 
  filter(continent != "Oceania") %>% 
  ggplot(aes(log(gdpPercap), lifeExp)) + 
  geom_point(size = 3.5, alpha = 0.5) +
  geom_smooth(method = lm, se = FALSE) +
  labs(x = "GDP per capita (log)", y = "Life Expectancy") +
  theme_bw() +
  facet_wrap(~ continent) 

## 保存图形
# 安装和加载showtext包用于显示中文字体
# install.packages("showtext")  
library(showtext)   
## Loading required package: sysfonts
## Loading required package: showtextdb
showtext_auto(enable = TRUE)
# 保存最近绘制的一张图
ggsave("my_graph.pdf", width = 10, height = 8)  
ggsave("my_graph.png", width = 7, height = 4)
# 保存已存储在r上的图形
my_plot <- ggplot(gapminder2007, aes(lifeExp, fill = gdpPercap_class)) + 
  geom_density(alpha = 0.6) + theme_bw(); my_plot

ggsave("my_graph2.pdf", plot = my_plot, width = 10, height = 8)


## 练习4 
# 使用ggplot自带的数据集diamonds绘制上述几种图形(折线图除外)


### 7. 回归分析
## 简单回归
# 简单回归模型: Yi = β0 + β*Xi + εi
# i = 1,…,n
# Yi = 因变量的取值
# β0 = 截距(intercept)
# β = 斜率(slope)
# Xi = 自变量的取值
# εi = 随机方差或残差

# 自变量为数值变量(连续变量)
fit <- lm(lifeExp ~ gdpPercap, data = gapminder2007)
summary(fit)  
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gapminder2007)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.828  -6.316   1.922   6.898  13.128 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.957e+01  1.010e+00   58.95   <2e-16 ***
## gdpPercap   6.371e-04  5.827e-05   10.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.899 on 140 degrees of freedom
## Multiple R-squared:  0.4606, Adjusted R-squared:  0.4567 
## F-statistic: 119.5 on 1 and 140 DF,  p-value: < 2.2e-16
# 解读:
# Estimate: 自变量gdpPercap每增加1个单位,因变量lifeExp平均增加0.637岁
# p-value: 在0.05显著性水平下是显著的
# R-squared: 自变量能够解释的因变量变化的比例
fit <- lm(lifeExp ~ log(gdpPercap), data = gapminder2007)
summary(fit)  
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = gapminder2007)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.947  -2.661   1.215   4.469  13.115 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.9496     3.8577   1.283    0.202    
## log(gdpPercap)   7.2028     0.4423  16.283   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.122 on 140 degrees of freedom
## Multiple R-squared:  0.6544, Adjusted R-squared:  0.652 
## F-statistic: 265.2 on 1 and 140 DF,  p-value: < 2.2e-16
# NOTE: 自变量和不同情况下取对数的解释
# 因变量Y取对数,自变量X不取对数,参数β解释为:x变动1个单位,y平均变动100*β%
# 因变量Y不取对数,自变量X取对数,参数β解释为:x变动1%,y平均变动0.01*β个单位
# 因变量Y取对数,自变量X也取对数,参数β解释为:x变动1%,y平均变动β%

# R方:回归线 vs. 均值线
gapminder2007 %>%
  ggplot(aes(log(gdpPercap), lifeExp)) + 
  geom_point(size = 3, alpha = 0.5) + 
  geom_smooth(method = "lm", se = FALSE) + theme_bw() +
  geom_hline(yintercept = mean(gapminder2007$lifeExp), color = "red")

# 自变量为类别变量(连续变量)
# 设定类别变量因子水平的顺序
gapminder2007$gdpPercap_class <- factor(gapminder2007$gdpPercap_class, 
                                        levels = c("Low Income","Middle Income", "High Income"))
fit <- lm(lifeExp ~ gdpPercap_class, data = gapminder2007)
summary(fit)  
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap_class, data = gapminder2007)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.362  -3.502   1.692   4.922  14.172 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    53.125      1.293  41.095   <2e-16 ***
## gdpPercap_classMiddle Income   14.850      1.591   9.335   <2e-16 ***
## gdpPercap_classHigh Income     25.885      1.828  14.159   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.756 on 139 degrees of freedom
## Multiple R-squared:  0.5931, Adjusted R-squared:  0.5873 
## F-statistic: 101.3 on 2 and 139 DF,  p-value: < 2.2e-16
# 解读:以Low Income为基准,因变量的取值在自变量取类别Middle Income时,比取基准类别时平均高14.85岁 (取High Income时高25.885)
# 虚拟变量的原理
class_avg <- gapminder2007 %>%
  group_by(gdpPercap_class) %>%
  summarise(lifeExp_avg = mean(lifeExp))
gapminder2007 %>%
  ggplot(aes(gdpPercap_class, lifeExp)) + 
  geom_point(size = 2, alpha = 0.3) + xlab("") + theme_bw() +
  annotate("segment", color = "red", linetype = "dashed",
           x = c(0, 0, 0), xend = c(1, 2, 3), 
           y = class_avg$lifeExp_avg, yend = class_avg$lifeExp_avg) +
  annotate("text", x = c(0.15, 0.15, 0.15), y = class_avg$lifeExp_avg + 1, 
           label = round(class_avg$lifeExp_avg, 2)) +
  annotate("segment", color = "blue", x = c(1, 1), xend = c(2, 3), 
           y = class_avg$lifeExp_avg[c(1, 1)], yend = class_avg$lifeExp_avg[c(2, 3)])

## 多元回归
# 多元回归中自变量有多个
model1 <- lm(lifeExp ~ log(gdpPercap), data = gapminder2007); summary(model1)
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = gapminder2007)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.947  -2.661   1.215   4.469  13.115 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.9496     3.8577   1.283    0.202    
## log(gdpPercap)   7.2028     0.4423  16.283   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.122 on 140 degrees of freedom
## Multiple R-squared:  0.6544, Adjusted R-squared:  0.652 
## F-statistic: 265.2 on 1 and 140 DF,  p-value: < 2.2e-16
model2 <- lm(lifeExp ~ log(gdpPercap) + log(pop), data = gapminder2007); summary(model2)
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap) + log(pop), data = gapminder2007)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.0435  -2.0693   0.9482   4.7514  12.8927 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -8.6161     7.5379  -1.143   0.2550    
## log(gdpPercap)   7.2445     0.4376  16.555   <2e-16 ***
## log(pop)         0.8114     0.3889   2.086   0.0388 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.038 on 139 degrees of freedom
## Multiple R-squared:  0.6649, Adjusted R-squared:  0.6601 
## F-statistic: 137.9 on 2 and 139 DF,  p-value: < 2.2e-16
model3 <- lm(lifeExp ~ log(gdpPercap) + log(pop) + continent, data = gapminder2007); summary(model3)
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap) + log(pop) + continent, 
##     data = gapminder2007)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4248  -2.2458  -0.0139   2.4683  14.9571 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       19.41824    7.45571   2.604  0.01023 *  
## log(gdpPercap)     4.64155    0.53758   8.634 1.46e-14 ***
## log(pop)           0.04029    0.35073   0.115  0.90871    
## continentAmericas 11.65638    1.69293   6.885 1.98e-10 ***
## continentAsia     10.05215    1.57756   6.372 2.73e-09 ***
## continentEurope   11.23199    1.92647   5.830 3.88e-08 ***
## continentOceania  12.89181    4.54931   2.834  0.00531 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.951 on 135 degrees of freedom
## Multiple R-squared:  0.7674, Adjusted R-squared:  0.7571 
## F-statistic: 74.23 on 6 and 135 DF,  p-value: < 2.2e-16
# 对Estimate的解读增加“在控制其他自变量不变的条件下”,其他表述与简单回归类似

## 回归诊断
# 线性回归假设:
# (1) 自变量与因变量关系的线性假设
# (2) 自变量相互独立
# (3) 误差项不相关
# (4) 方差齐性,即不存在异方差现象(heteroskedasticity)
# NOTE: 解决异方差性的通常方法是对因变量取对数
par(mfrow = c(2, 2))
plot(model3, which = c(1:4)) 

## 使用表格形式报告结果
# install.packages("stargazer")
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# 输出text格式
stargazer(model1, model2, model3, type = "text")
## 
## =============================================================================================
##                                                Dependent variable:                           
##                     -------------------------------------------------------------------------
##                                                      lifeExp                                 
##                               (1)                      (2)                      (3)          
## ---------------------------------------------------------------------------------------------
## log(gdpPercap)              7.203***                 7.245***                4.642***        
##                             (0.442)                  (0.438)                  (0.538)        
##                                                                                              
## log(pop)                                             0.811**                   0.040         
##                                                      (0.389)                  (0.351)        
##                                                                                              
## continentAmericas                                                            11.656***       
##                                                                               (1.693)        
##                                                                                              
## continentAsia                                                                10.052***       
##                                                                               (1.578)        
##                                                                                              
## continentEurope                                                              11.232***       
##                                                                               (1.926)        
##                                                                                              
## continentOceania                                                             12.892***       
##                                                                               (4.549)        
##                                                                                              
## Constant                     4.950                    -8.616                 19.418**        
##                             (3.858)                  (7.538)                  (7.456)        
##                                                                                              
## ---------------------------------------------------------------------------------------------
## Observations                  142                      142                      142          
## R2                           0.654                    0.665                    0.767         
## Adjusted R2                  0.652                    0.660                    0.757         
## Residual Std. Error     7.122 (df = 140)         7.038 (df = 139)        5.951 (df = 135)    
## F Statistic         265.150*** (df = 1; 140) 137.925*** (df = 2; 139) 74.228*** (df = 6; 135)
## =============================================================================================
## Note:                                                             *p<0.1; **p<0.05; ***p<0.01
# 输出html格式并保存为lifeExp_model
# stargazer(model1, model2, model3, type = "html", out = "lifeExp_model.htm") 
# NOTE: Microsoft Word可以读入 *.htm文档, 并进行编辑修改

## 使用图形报告结果
# install.packages(c("dotwhisker", "broom"))
library(dotwhisker)
library(broom)
# 绘制模型3的点须图
dwplot(model3)

# 绘制模型1-3的点须图
dwplot(list(model1, model2, model3))

# 模型3黑白版的点须图
dwplot(model3, whisker_args = list(lwd = 0.8, color = "black"), 
       dot_args = list(size = 3, pch = 21, fill = "black", color = "black")) %>%
  relabel_predictors(c("log(gdpPercap)" = "GDP per capita(log)",
                       "log(pop)" = "population(log)",
                       "continentAmericas" = "Continent: Americas",
                       "continentAsia" = "Continent: Asia",
                       "continentEurope" = "Continent: Europe",
                       "continentOceania" = "Continent: Oceania")) +
  geom_vline(xintercept = 0, color = "grey60", linetype = 2) +
  xlab("Coefficient") + theme_bw() + 
  theme(legend.position = "none")